ENV222 EXERCISE

R in statistics

Author: TC-tea

Published: 2023.05.11


0 Quiz 1

1.
The distribution is shown in Figure @ref(fig:box).

boxplot(airquality$Ozone)

A box plot for the ozone concentration

2.
rticles
3.

plot(1, 
     type = 'n', 
     axes = FALSE, 
     xlab = expression(NH[3] * " (" * mu* "mol m"^-3 * ")"), 
     ylab = '')

dtf <- c(1, 2, 3)
library(latex2exp)
plot(dtf, pch = 16, las = 1,
     xlab = TeX('$NH_3$ ($\\mu$mol m$^{-3}$)'),
     xaxt = "n", yaxt = "n")

mymat <- matrix(c(1, 2, 3, 0), nrow = 2)
layout(mymat, widths = c(4, 1), heights = c(2, 1))  # Set the ratio between widths and heights
plot(iris$Sepal.Length, iris$Sepal.Width, pch = 20,
     xlab = 'Sepal Length (cm)', ylab = 'Sepal Width (cm)', las = 1)
boxplot(iris$Sepal.Length, pch = 20, las = 1, horizontal = TRUE)
boxplot(iris$Sepal.Width, pch = 20, las = 2)

x <- 'pack my box with five dozen liquor jugs'
xsingle <- strsplit(x, split = "")[[1]]
xsingle <- xsingle[xsingle != " "]
length(xsingle)
[1] 32
mytab <- table(xsingle)
names(mytab[mytab == max(mytab)])
[1] "i" "o"
library(dplyr)
x <- "pack my box with five dozen liquor jugs"
char_freq <- gsub(" ", "", x) |>
    strsplit("") |>
    table() |>
    data.frame() |>
    arrange(desc(Freq))
head(char_freq, 5)
  Var1 Freq
1    i    3
2    o    3
3    e    2
4    u    2
5    a    1
xword <- strsplit(x, " ")[[1]]
xword[which.max(nchar(xword))]
[1] "liquor"
gsub('five', '5', x)
[1] "pack my box with 5 dozen liquor jugs"
[1] "English_United States.1252"
sum(diff(mydata$date) != 1)
[1] 0
mydata$year <- as.integer(format(mydata$date, '%Y'))
mydata$month <- format(mydata$date, '%B')
mydata$weekday <- format(mydata$date, '%A')

Result:

mydata
# A tibble: 65,533 × 13
   date                   ws    wd   nox   no2    o3  pm10   so2    co  pm25
   <dttm>              <dbl> <int> <int> <int> <int> <int> <dbl> <dbl> <int>
 1 1998-01-01 00:00:00  0.6    280   285    39     1    29  4.72  3.37    NA
 2 1998-01-01 01:00:00  2.16   230    NA    NA    NA    37 NA    NA       NA
 3 1998-01-01 02:00:00  2.76   190    NA    NA     3    34  6.83  9.60    NA
 4 1998-01-01 03:00:00  2.16   170   493    52     3    35  7.66 10.2     NA
 5 1998-01-01 04:00:00  2.4    180   468    78     2    34  8.07  8.91    NA
 6 1998-01-01 05:00:00  3      190   264    42     0    16  5.50  3.05    NA
 7 1998-01-01 06:00:00  3      140   171    38     0    11  4.23  2.26    NA
 8 1998-01-01 07:00:00  3      170   195    51     0    12  3.88  2.00    NA
 9 1998-01-01 08:00:00  3.36   170   137    42     1    12  3.35  1.46    NA
10 1998-01-01 09:00:00  3.96   170   113    39     2    12  2.92  1.20    NA
# ℹ 65,523 more rows
# ℹ 3 more variables: year <int>, month <chr>, weekday <chr>

0 Quiz 2

1.
C

2.
B

3.
Wilcoxon Rank-Sum test

4.

  • H0: the medians of the PM2.5 concentrations in the two cities are equal.
  • H1: the medians of the PM2.5 concentrations in the two cities are not equal.

5.
There are no tied values, because the ranks are all integers:

rank(c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69))
 [1]  6  8  4  5  7  3  1 10  2  9
rank(c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
 [1]  5  1 10  4  9  2  3  8  6  7
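The absence of ties can also be checked programmatically (a quick sketch; anyDuplicated() returns 0 when all values are unique):

x <- c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69,
       14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53)
anyDuplicated(x) == 0  # TRUE: no ties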

6.
Test statistic value: W = 23

dtf <- data.frame(City_A = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69),
                  City_B = c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
test <- wilcox.test(dtf$City_A, dtf$City_B, conf.int=TRUE)
test$statistic
 W 
23 

7.
Critical values for the two-sided test at alpha = 0.05: 24 and 76

dtf <- data.frame(conc = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69, 14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53),
                  city = c(rep("A", 10), rep("B", 10)))
m <- sum(dtf$city == "A")
n <- sum(dtf$city == "B")
U_critical  <- qwilcox(c(0.025, 0.975), m, n)
U_critical
[1] 24 76
dtf <- data.frame(City_A = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69),
                  City_B = c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
m <- nrow(dtf)
n <- nrow(dtf)
U_critical  <- qwilcox(c(0.025, 0.975), m, n)
U_critical
[1] 24 76
U <- 23
m <- 10
n <- 10
curve(dwilcox(x, m, n), from = 0, to = 100, n = 101,
      ylab = 'Probability density', las = 1, type = 'S', ylim = c(0, 0.031))
abline(h = 0)
segments(x0 = c(U, U_critical), y0 = 0, x1 = c(U, U_critical),
         y1 = dwilcox(c(U, U_critical), m, n), col = c('blue', 'red', 'red'))
legend('topright', legend = c('Distribution curve', 'Critical values', 'U score'),
       col = c('black', 'red', 'blue'), lty = 1, cex = 0.7)

9.
The P-value is 0.043

dtf <- data.frame(City_A = c(13.31, 13.67, 12.805, 13.1, 13.39, 12.375, 11.58, 14.58, 12.029, 13.69),
                  City_B = c(14.14, 12.54, 15.63, 13.66, 15.51, 13.23, 13.25, 15.2, 14.18, 14.53))
test <- wilcox.test(dtf$City_A, dtf$City_B, conf.int=TRUE)
round(test$p.value, 3)
[1] 0.043

10.
The P-value is smaller than 0.05, so we reject the H0 hypothesis.

11.
The two medians are different at alpha = 0.05: City_A and City_B have significantly different median PM2.5 concentrations.

12.
The mean and standard deviation of the approximate normal distribution are 50 and 13.22876, respectively.

U_mu <- m * n / 2
U_mu
[1] 50
U_sd <- sqrt(m * n * (m + n + 1) / 12)
U_sd
[1] 13.22876

13.

curve(dnorm(x, U_mu, U_sd), from=0, to=110)
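With this approximation, the two-sided P-value can be computed directly (a sketch using a 0.5 continuity correction; the result, about 0.045, is close to the exact P-value of 0.043):

z <- (U + 0.5 - U_mu) / U_sd
2 * pnorm(z)  # approximately 0.045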

1 Rmarkdown Exercise

NA

2 Character Exercise

NA

3 Time Exercise

1. Today

  1. Which day of the year is it?
Sys.time()
[1] "2023-05-11 12:27:12 CST"
format(Sys.Date(), "%j")  # the day of the year
[1] "131"
  1. Convert it as “Year, Month in English and Date, Weekday in English”, such as “2023, January 1, Sunday”.
format(Sys.Date(), format = '%Y, %B %d, %A')
[1] "2023, May 11, Thursday"
  1. Suppose we get to know each other at 15:00, September 13, 2022. How many days have we known each other until now? How many hours? How many seconds?
t1 <- Sys.time()
t2 <- strptime( "2022-9-13 15:00:00", format="%Y-%m-%d %H:%M:%S" )
difftime(time1 = t1, time2 = t2, units = 'secs')
Time difference of 20726833 secs
difftime(time1 = t1, time2 = t2, units = 'days')
Time difference of 239.8939 days
difftime(time1 = t1, time2 = t2, units = 'hours')
Time difference of 5757.454 hours
  1. The anniversary is September 13, 2023. What day is it? How about in 2024? Plot a graph for September 13 of each year, with the weekday as x and the year as y.
t3 <- as.Date( "9/13/2023", format="%m/%d/%Y" )
format(t3, format="%A")
[1] "Wednesday"
t4 <- as.Date( "9/13/2024", format="%m/%d/%Y" )
format(t4, format="%A")
[1] "Friday"
dates <- seq(as.Date("2020-09-13"), as.Date("2040-09-13"), by = "year")
weekday <- as.integer(format(dates, "%u"))  # 1 = Monday, ..., 7 = Sunday
plot(weekday, dates,
     xlab = "Weekday", ylab = "Year", main = "September 13th, 2020-2040")

2. The airquality dataset

  1. Create a new column “Date”, showing the date in the format of “Year-month-day”, such as “2023-01-31”.
air <- read.csv("data/airquality.csv")
day <- paste("1973-", air$Month, "-", air$Day, sep = "")  # the airquality data are from 1973
air$Date <- as.Date(day, format = "%Y-%m-%d")
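The same column can be built without string pasting (a sketch using base R's ISOdate()):

air$Date <- as.Date(ISOdate(1973, air$Month, air$Day))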
  1. Plot a graph for each atmospheric variable (i.e. ozone in ppb, solar radiation in W m-2, wind speed in m s-1, air temperature in °C) against the date.
air1 <- na.omit(air)
par(mfrow = c(2, 3))
for (m in unique(air1$Month)) {
  sub <- subset(air1, Month == m)
  plot(sub$Day, sub$Ozone, ylab = "Ozone (ppb)",
       main = paste("Ozone, month", m))
}
library(ggplot2)
library(latex2exp)
dtf <- air  # the data frame with the Date column created above
dtf5 <- subset(dtf, Date < as.Date("1973-06-01"))

ggplot(dtf5) +
    geom_point(aes(x = Date, y = Ozone))+
    labs(y = "Ozone (ppb)") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
    
ggplot(dtf5) +
    geom_point(aes(x = Date, y = Solar.R))+
    labs(y = TeX("$\\text{Solar Radiation } (W\\cdot m^{-2})$")) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(dtf5) +
    geom_point(aes(x = Date, y = Wind))+
    labs(y = TeX("$\\text{Wind speed }m\\cdot m^{-1}$")) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(dtf5) +
    geom_point(aes(x = Date, y = Temp))+
    labs(y = TeX("$\\text{Temperature }\\circ C$")) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  1. Create a new column “Weekday”, showing the day in the week (1 as Monday and 7 as Sunday).
# install.packages("lubridate")
library(lubridate)
dtf$Weekday <- wday(as.Date(dtf$Date), week_start = 1)
  1. Calculate the mean values of each atmospheric variable for each weekday.
dtf1 <- na.omit(dtf)
tapply(dtf1$Ozone, dtf1$Weekday, mean)

library(tidyverse)
mean_dtf <- dtf %>%
    drop_na() %>%
    group_by(Weekday) %>%
    summarise(Ozone = mean(Ozone),
            Solar.R = mean(Solar.R),
            Wind = mean(Wind),
            Temp = mean(Temp))
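The four means can also be computed in one call with across() (a sketch, dplyr >= 1.0):

mean_dtf <- dtf %>%
    drop_na() %>%
    group_by(Weekday) %>%
    summarise(across(c(Ozone, Solar.R, Wind, Temp), mean))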
  1. Plot a graph for each atmospheric variable with the mean values against the weekdays.
library(latex2exp)
library(patchwork)
mean_ozone <- ggplot(mean_dtf) +
    geom_line(aes(x = Weekday, y = Ozone)) +
    labs(y = "Ozone (ppb)") +
    theme_classic()

mean_solar <- ggplot(mean_dtf) +
    geom_line(aes(x = Weekday, y = Solar.R)) +
    labs(y = TeX("$\\text{Solar Radiation } (W\\cdot m^{-2})$")) +
    theme_classic()

mean_wind <- ggplot(mean_dtf) +
    geom_line(aes(x = Weekday, y = Wind))+
    labs(y = TeX("$\\text{Wind speed }m\\cdot m^{-1}$")) +
    theme_classic()

mean_temp <- ggplot(mean_dtf) +
    geom_line(aes(x = Weekday, y = Temp))+
    labs(y = TeX("$\\text{Temperature }\\circ C$")) +
    theme_classic()
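patchwork is loaded above but never used; one line arranges the four panels in a 2 x 2 grid (a sketch using patchwork's + and / operators):

(mean_ozone + mean_solar) / (mean_wind + mean_temp)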

3. The Smart Flux dataset

The smartflux.zip dataset was obtained from the flux observation at the Outdoor Research and Teaching Station (ORTS), XJTLU. It was supposed to be an automatic continuous measurement on a half-hourly basis, saved in daily files. However, there are missing data in the records for some reason. How many data files are missing? How many records are missing?

setwd("data/smartflux/") # Remeber to delete the first file
temp <- list.files() %>% 
    substr(start = 1, stop =10) %>%
    strptime(format = "%Y-%m-%d") %>%
    format("%Y-%m-%d")

missing_files <- 0
for(i in 1:(length(temp)-1)){
    diff <- as.vector(difftime(temp[i+1], temp[i], units = "days"))
    if(diff != 1){
        missing_files <- missing_files + diff -1
    }
}
print(missing_files)
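The same count can be obtained without a loop (a sketch, assuming every file name starts with an ISO date, as above):

days <- as.Date(substr(list.files(), 1, 10))
all_days <- seq(min(days), max(days), by = "day")
sum(!all_days %in% days)  # days with no file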

temp <- list.files()
files <- lapply(temp, function(i){
    read.table(i, fill = TRUE)
})

missing_records <- 0
for(i in 1:length(files)){
    records <- length(files[[i]][3:nrow(files[[i]]),4])
    missing_records <- missing_records + 48 - records
}

missing_records <- missing_records + 48 * missing_files
print(missing_records)

4 Plot Exercise

  1. I think a colour range from red to blue is sufficient; the purple is unnecessary and may make the graph less readable.

    1. Where are the x and y labels? Even though the information is simple, we still need the basic elements.
    2. The main problem is that the y label has no unit. Furthermore, we had better avoid notation such as p53-/- in labels.
    3. We had better not use 3D plots in an essay, especially for plots involving proportions.
    4. The two figures can be combined into one with different colours. We could also change the theme to a blank background, which makes the figure easier for readers to interpret.
library(latex2exp)
plot(iris$Sepal.Length, iris$Petal.Length, pch = 16, las = 1,
     xlab = TeX('D$_p$ ($\\mu$m)'),
     ylab = expression(frac(dN, dlogD[p]) * ' (cm'^-3 * ')'))

A scatterplot

mymat <- matrix(c(1, 2, 3, 4), nrow = 2)
layout(mymat, widths = c(1, 2), heights = c(2, 1))  # lengths must match ncol() and nrow() of mymat
plot(co2)
plot(co2)
plot(co2)
plot(co2)

  1. There are some errors related to the figure margins, which prevented me from generating the PDF book.
    See Figure @ref(fig:figure1).

5 R Tidyverse

stocks <- data.frame(year   = c(2015, 2015, 2016, 2016),
                     half   = c(   1,    2,    1,    2),
                     return = c(1.88, 0.59, 0.92, 0.17))
stocks_wide <- stocks %>% 
  pivot_wider(names_from = half, values_from = return)
stocks_wide
# A tibble: 2 × 3
   year   `1`   `2`
  <dbl> <dbl> <dbl>
1  2015  1.88  0.59
2  2016  0.92  0.17
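Pivoting back shows why pivot_wider() and pivot_longer() are not perfectly symmetrical: the column type information is lost, so half comes back as character unless names_transform is used (a sketch):

stocks_wide %>%
  pivot_longer(`1`:`2`, names_to = "half", values_to = "return")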
# pivot_longer() fails without backticks because the column names 1999 and 2000
# start with digits and cannot be referenced as bare names.
table4a %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
# A tibble: 6 × 3
  country     year   cases
  <chr>       <chr>  <dbl>
1 Afghanistan 1999     745
2 Afghanistan 2000    2666
3 Brazil      1999   37737
4 Brazil      2000   80488
5 China       1999  212258
6 China       2000  213766
# Widening this table fails because the name/names combination is duplicated:
# "Phillip Woods" has two "age" values, so the table is not tidy.
people <- tribble(
  ~name,             ~names,  ~values,
  "Phillip Woods",   "age",       45,
  "Phillip Woods",   "height",   186,
  "Phillip Woods",   "age",       50,
  "Jessica Cordero", "age",       37,
  "Jessica Cordero", "height",   156
)
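A common fix (a sketch, assuming dplyr is loaded) is to number the repeated observations before widening:

people %>%
  group_by(name, names) %>%
  mutate(obs = row_number()) %>%  # distinguish the duplicated name/names pairs
  ungroup() %>%
  pivot_wider(names_from = names, values_from = values)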
preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes",     NA,    10,
  "no",      20,    12
)
preg1 <- pivot_longer(preg, c("male", "female"), names_to = "Gender", values_to = "Number")
preg1
# A tibble: 4 × 3
  pregnant Gender Number
  <chr>    <chr>   <dbl>
1 yes      male       NA
2 yes      female     10
3 no       male       20
4 no       female     12

5.
(1) To determine which plane (tailnum) has the worst on-time record, we can calculate the average departure delay for each plane and then sort the results in descending order to find the plane with the highest average delay.

# install.packages("nycflights13")
library(dplyr)
library(nycflights13)
flights %>%
  group_by(tailnum) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay)) %>%
  head(1)
# A tibble: 1 × 2
  tailnum avg_dep_delay
  <chr>           <dbl>
1 N844MH            297


(2) We should depart at 5 am, which has the smallest average departure delay of 0.688 min.

flights %>%
  group_by(hour) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(avg_dep_delay) %>%
  head(2)
# A tibble: 2 × 2
   hour avg_dep_delay
  <dbl>         <dbl>
1     5         0.688
2     6         1.64 


(3) To compute the total minutes of delay for each destination, we can group the flights by destination and sum the arrival and departure delays. Adding those two values gives the total delay for each destination.

flights %>%
  group_by(dest) %>%
  summarise(total_delay = sum(arr_delay + dep_delay, na.rm = TRUE))
# A tibble: 105 × 2
   dest  total_delay
   <chr>       <dbl>
 1 ABQ          4603
 2 ACK          2983
 3 ALB         15819
 4 ANC            83
 5 ATL        399462
 6 AUS         45898
 7 AVL          4216
 8 BDL         10205
 9 BGR          9732
10 BHM         12345
# ℹ 95 more rows
flights %>%
  group_by(dest) %>%
  mutate(total_delay = arr_delay + dep_delay) %>%
  mutate(prop_delay = total_delay / sum(total_delay, na.rm = TRUE))
# A tibble: 336,776 × 21
# Groups:   dest [105]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 336,766 more rows
# ℹ 13 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>, total_delay <dbl>,
#   prop_delay <dbl>

(4)



(5) For each plane in each month, we can group the flights data by tailnum and month, then count the flights with n().

flights %>%
  group_by(tailnum, month) %>%
  summarise(flights = n())
# A tibble: 37,988 × 3
# Groups:   tailnum [4,044]
   tailnum month flights
   <chr>   <int>   <int>
 1 D942DN      2       1
 2 D942DN      3       2
 3 D942DN      7       1
 4 N0EGMQ      1      41
 5 N0EGMQ      2      28
 6 N0EGMQ      3      30
 7 N0EGMQ      4      29
 8 N0EGMQ      5      13
 9 N0EGMQ      6      32
10 N0EGMQ      7      11
# ℹ 37,978 more rows

6 Post-hoc tests

  1. The three species have significantly different mean sepal lengths at the 0.05 alpha level.
library(agricolae)
iris.aov <- aov(Sepal.Length ~ Species, data = iris)
summary(iris.aov)
             Df Sum Sq Mean Sq F value Pr(>F)    
Species       2  63.21  31.606   119.3 <2e-16 ***
Residuals   147  38.96   0.265                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
LSD.test(iris.aov, "Species", console = TRUE)

Study: iris.aov ~ "Species"

LSD t Test for Sepal.Length 

Mean Square Error:  0.2650082 

Species,  means and individual ( 95 %) CI

           Sepal.Length       std  r      LCL      UCL Min Max
setosa            5.006 0.3524897 50 4.862126 5.149874 4.3 5.8
versicolor        5.936 0.5161711 50 5.792126 6.079874 4.9 7.0
virginica         6.588 0.6358796 50 6.444126 6.731874 4.9 7.9

Alpha: 0.05 ; DF Error: 147
Critical Value of t: 1.976233 

least Significant Difference: 0.2034688 

Treatments with the same letter are not significantly different.

           Sepal.Length groups
virginica         6.588      a
versicolor        5.936      b
setosa            5.006      c
pairwise.t.test(iris$Sepal.Length, iris$Species, p.adjust.method = "bonferroni")

    Pairwise comparisons using t tests with pooled SD 

data:  iris$Sepal.Length and iris$Species 

           setosa  versicolor
versicolor 2.6e-15 -         
virginica  < 2e-16 8.3e-09   

P value adjustment method: bonferroni 
  1. The haematocrit measurements differ for at least two water sources at the 0.01 significance level. Specifically, Dushu and the Control differ significantly.
inflen <- data.frame("HTC"= c(38, 40, 32, 36, 40, 40, 38, 40, 38, 40, 36, 40, 40, 35, 45,
                              56, 60, 50, 50, 50, 35, 40, 40, 55, 35,
                              40, 42, 38, 46, 36),
                     "Source" = c(rep("Control",15), 
                                  rep("Dushu",10),
                                  rep("Jinji",5)))
wg_aov1 <- aov(HTC~Source, data = inflen)
summary(wg_aov1)
            Df Sum Sq Mean Sq F value  Pr(>F)   
Source       2  450.5  225.23   6.669 0.00443 **
Residuals   27  911.8   33.77                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(LSD.test(wg_aov1, "Source", alpha = 0.01,console = T))

Study: wg_aov1 ~ "Source"

LSD t Test for HTC 

Mean Square Error:  33.7716 

Source,  means and individual ( 99 %) CI

             HTC      std  r      LCL      UCL Min Max
Control 38.53333 2.996824 15 34.37598 42.69069  32  45
Dushu   47.10000 8.987028 10 42.00830 52.19170  35  60
Jinji   40.40000 3.847077  5 33.19925 47.60075  36  46

Alpha: 0.01 ; DF Error: 27
Critical Value of t: 2.770683 

Groups according to probability of means differences and alpha level( 0.01 )

Treatments with the same letter are not significantly different.

             HTC groups
Dushu   47.10000      a
Jinji   40.40000     ab
Control 38.53333      b
           Length Class      Mode
statistics  4     data.frame list
parameters  5     data.frame list
means      10     data.frame list
comparison  0     -none-     NULL
groups      2     data.frame list
inflen_pt <- pairwise.t.test(inflen$HTC, inflen$Source, pool.sd = FALSE, var.equal = TRUE, p.adj = "bonferroni")
inflen_pt

    Pairwise comparisons using t tests with non-pooled SD 

data:  inflen$HTC and inflen$Source 

      Control Dushu 
Dushu 0.0066  -     
Jinji 0.8227  0.4191

P value adjustment method: bonferroni 
inflen_pt$p.value < 0.01
      Control Dushu
Dushu    TRUE    NA
Jinji   FALSE FALSE

7 Non-parametric hypothesis tests

  • Wilcoxon Rank-Sum test: Used to compare whether the medians of two independent groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.

  • Wilcoxon Signed-Rank test: Used to compare whether the medians of two related groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.

  • Kruskal-Wallis test: Used to compare whether the medians of three or more independent groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.

  • Friedman test: Used to compare whether the medians of three or more related groups are equal. It is suitable for situations where the sample does not follow a normal distribution or where the variances are unequal.
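In R, these four tests map onto the following calls (a minimal sketch; x, y, value, group, treatment, block, and d are placeholder names):

wilcox.test(x, y)                                   # Wilcoxon Rank-Sum (independent samples)
wilcox.test(x, y, paired = TRUE)                    # Wilcoxon Signed-Rank (paired samples)
kruskal.test(value ~ group, data = d)               # Kruskal-Wallis
friedman.test(value ~ treatment | block, data = d)  # Friedman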

  1. Trichloroethylene (TCE) is reported by the EPA to be the most prevalent solvent contaminating groundwater at superfund sites in the United States. Exposure can result in effects to the immune and reproductive systems, liver, kidneys, central nervous system, and may affect fetal development during pregnancy. Long term exposures to TCE can increase the risk of kidney cancer. There is also evidence that TCE exposure can increase the risk for non-Hodgkin’s lymphoma and liver cancer.

    The “tce” dataset shows the TCE concentrations (mg/L) at 10 randomly selected groundwater monitoring wells before remediation and the same wells after remediation. Download the tce.csv file. Is the median TCE concentration different after the remediation from before? Carry out a nonparametric test.
dtf <- read.csv('data/tce.csv')
wilcox.test(dtf$TCE.mg.per.L[dtf$Period == 'Before'], 
            dtf$TCE.mg.per.L[dtf$Period == 'After'], 
            paired = TRUE, conf.int = TRUE, 
            correct = FALSE)

    Wilcoxon signed rank exact test

data:  dtf$TCE.mg.per.L[dtf$Period == "Before"] and dtf$TCE.mg.per.L[dtf$Period == "After"]
V = 55, p-value = 0.001953
alternative hypothesis: true location shift is not equal to 0
95 percent confidence interval:
  6.8650 28.5715
sample estimates:
(pseudo)median 
         18.78 
  1. The iris dataset. Are the median sepal lengths equal among the three species? Carry out a nonparametric test.
kruskal.test(Sepal.Length ~ Species, data = iris)

    Kruskal-Wallis rank sum test

data:  Sepal.Length by Species
Kruskal-Wallis chi-squared = 96.937, df = 2, p-value < 2.2e-16
boxplot(Sepal.Length ~ Species, 
        data = iris, 
        horizontal = TRUE, 
        notch = TRUE, las = 1)
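To see which species differ, a pairwise follow-up can be run with base R's pairwise.wilcox.test() (a sketch with Bonferroni adjustment):

pairwise.wilcox.test(iris$Sepal.Length, iris$Species, p.adjust.method = "bonferroni")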

8 Multiple Linear Regression

  1. The airquality dataset. Build a multiple linear regression model for predicting the ozone concentration (the dependent variable) with the solar radiation, wind speed, temperature, and month (the independent variables).
# Step 1: Visualization
dtf <- airquality[, -6]
dtf <- na.omit(dtf)
dtf_cor <- cor(dtf)
GGally::ggpairs(dtf)

pairs(dtf)

corrplot::corrplot(dtf_cor, 
                   order = 'AOE', 
                   type = "upper", 
                   method = "number")
corrplot::corrplot(dtf_cor, 
                   order = "AOE", 
                   type = "lower",
                   method = "ell", 
                   diag = FALSE, 
                   tl.pos = "l", 
                   cl.pos = "n", 
                   add = TRUE)

# Step 2: Maximal model
ozone_concentration_lm <- lm(Ozone ~ ., data = dtf)
summary(ozone_concentration_lm)

Call:
lm(formula = Ozone ~ ., data = dtf)

Residuals:
    Min      1Q  Median      3Q     Max 
-35.870 -13.968  -2.671   9.553  97.918 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -58.05384   22.97114  -2.527   0.0130 *  
Solar.R       0.04960    0.02346   2.114   0.0368 *  
Wind         -3.31651    0.64579  -5.136 1.29e-06 ***
Temp          1.87087    0.27363   6.837 5.34e-10 ***
Month        -2.99163    1.51592  -1.973   0.0510 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 20.9 on 106 degrees of freedom
Multiple R-squared:  0.6199,    Adjusted R-squared:  0.6055 
F-statistic: 43.21 on 4 and 106 DF,  p-value: < 2.2e-16
confint(ozone_concentration_lm)
                    2.5 %       97.5 %
(Intercept) -1.035964e+02 -12.51131722
Solar.R      3.088529e-03   0.09610512
Wind        -4.596849e+00  -2.03616955
Temp         1.328372e+00   2.41337586
Month       -5.997092e+00   0.01383629
anova(ozone_concentration_lm)
Analysis of Variance Table

Response: Ozone
           Df Sum Sq Mean Sq F value    Pr(>F)    
Solar.R     1  14780   14780 33.8357 6.427e-08 ***
Wind        1  39969   39969 91.5037 5.327e-16 ***
Temp        1  19050   19050 43.6117 1.639e-09 ***
Month       1   1701    1701  3.8946   0.05104 .  
Residuals 106  46302     437                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 3: Check the Multicollinearity
faraway::vif(dtf[, -1])
 Solar.R     Wind     Temp    Month 
1.151409 1.329309 1.712455 1.256371 
# Step 4: simplify the model using Backward selection
get_p <- function(model){
  # Extract the coefficient table and sort it by descending P-value,
  # so the least significant term (the next candidate to drop) comes first
  smry2 <- data.frame(summary(model)$coefficients)
  smry2[order(-smry2$Pr...t..), ]
}
get_p(ozone_concentration_lm)
                Estimate  Std..Error   t.value     Pr...t..
Month        -2.99162786  1.51592267 -1.973470 5.104492e-02
Solar.R       0.04959683  0.02345827  2.114258 3.683836e-02
(Intercept) -58.05383883 22.97114118 -2.527251 1.297198e-02
Wind         -3.31650940  0.64578918 -5.135591 1.286145e-06
Temp          1.87087379  0.27363201  6.837189 5.337230e-10
lm2 <- update(ozone_concentration_lm,.~.-Month)
get_p(lm2)
                Estimate  Std..Error   t.value     Pr...t..
Solar.R       0.05982059  0.02318647  2.579979 1.123664e-02
(Intercept) -64.34207893 23.05472435 -2.790841 6.226638e-03
Wind         -3.33359131  0.65440710 -5.094063 1.515934e-06
Temp          1.65209291  0.25352979  6.516366 2.423506e-09
lm3 <- step(ozone_concentration_lm, direction = "both")
Start:  AIC=679.71
Ozone ~ Solar.R + Wind + Temp + Month

          Df Sum of Sq   RSS    AIC
<none>                 46302 679.71
- Month    1    1701.2 48003 681.71
- Solar.R  1    1952.6 48254 682.29
- Wind     1   11520.5 57822 702.37
- Temp     1   20419.5 66721 718.26
# Step 5: report the results. Note that step() keeps Month by AIC (679.71 < 681.71),
# but we report the simpler model lm2 chosen by the P-value criterion.
summary(lm2)

Call:
lm(formula = Ozone ~ Solar.R + Wind + Temp, data = dtf)

Residuals:
    Min      1Q  Median      3Q     Max 
-40.485 -14.219  -3.551  10.097  95.619 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -64.34208   23.05472  -2.791  0.00623 ** 
Solar.R       0.05982    0.02319   2.580  0.01124 *  
Wind         -3.33359    0.65441  -5.094 1.52e-06 ***
Temp          1.65209    0.25353   6.516 2.42e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 21.18 on 107 degrees of freedom
Multiple R-squared:  0.6059,    Adjusted R-squared:  0.5948 
F-statistic: 54.83 on 3 and 107 DF,  p-value: < 2.2e-16
stargazer::stargazer(lm2, type = 'text')

===============================================
                        Dependent variable:    
                    ---------------------------
                               Ozone           
-----------------------------------------------
Solar.R                       0.060**          
                              (0.023)          
                                               
Wind                         -3.334***         
                              (0.654)          
                                               
Temp                         1.652***          
                              (0.254)          
                                               
Constant                    -64.342***         
                             (23.055)          
                                               
-----------------------------------------------
Observations                    111            
R2                             0.606           
Adjusted R2                    0.595           
Residual Std. Error      21.181 (df = 107)     
F Statistic           54.834*** (df = 3; 107)  
===============================================
Note:               *p<0.1; **p<0.05; ***p<0.01
  1. A scientist studies the social factors that influence heart disease. She surveys 500 towns and gathers data on the percentage of people in each town who smoke, the percentage who bike to work, and the percentage who have heart disease. The dataset is available as heart.data.csv. Perform a multiple linear regression analysis.
dtf <- read.csv("data/heart.data.csv")
dtf <- dtf[, -1]
dtf <- na.omit(dtf)
dtf_cor <- cor(dtf)
GGally::ggpairs(dtf)

corrplot::corrplot(dtf_cor, 
                   order = 'AOE', 
                   type = "upper", 
                   method = "number")
corrplot::corrplot(dtf_cor, 
                   order = "AOE", 
                   type = "lower",
                   method = "ell", 
                   diag = FALSE, 
                   tl.pos = "l", 
                   cl.pos = "n", 
                   add = TRUE)

heart_disease_lm <- lm(heart.disease ~ ., data = dtf)
summary(heart_disease_lm)

Call:
lm(formula = heart.disease ~ ., data = dtf)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.1789 -0.4463  0.0362  0.4422  1.9331 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 14.984658   0.080137  186.99   <2e-16 ***
biking      -0.200133   0.001366 -146.53   <2e-16 ***
smoking      0.178334   0.003539   50.39   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.654 on 495 degrees of freedom
Multiple R-squared:  0.9796,    Adjusted R-squared:  0.9795 
F-statistic: 1.19e+04 on 2 and 495 DF,  p-value: < 2.2e-16
confint(heart_disease_lm)
                 2.5 %     97.5 %
(Intercept) 14.8272075 15.1421084
biking      -0.2028166 -0.1974495
smoking      0.1713800  0.1852878
anova(heart_disease_lm)
Analysis of Variance Table

Response: heart.disease
           Df Sum Sq Mean Sq F value    Pr(>F)    
biking      1 9090.6  9090.6 21251.7 < 2.2e-16 ***
smoking     1 1086.0  1086.0  2538.8 < 2.2e-16 ***
Residuals 495  211.7     0.4                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
faraway::vif(dtf[, 1:2])
  biking  smoking 
1.000229 1.000229 
get_p <- function(model){
  smry2 <- data.frame(summary(model)$coefficients)
  smry2[order(-smry2$Pr...t..), ]
}
get_p(heart_disease_lm)
              Estimate  Std..Error    t.value      Pr...t..
smoking      0.1783339 0.003539307   50.38667 5.192235e-197
(Intercept) 14.9846580 0.080136914  186.98821  0.000000e+00
biking      -0.2001331 0.001365858 -146.52547  0.000000e+00
stargazer::stargazer(heart_disease_lm, type = 'text')

===============================================
                        Dependent variable:    
                    ---------------------------
                           heart.disease       
-----------------------------------------------
biking                       -0.200***         
                              (0.001)          
                                               
smoking                      0.178***          
                              (0.004)          
                                               
Constant                     14.985***         
                              (0.080)          
                                               
-----------------------------------------------
Observations                    498            
R2                             0.980           
Adjusted R2                    0.980           
Residual Std. Error      0.654 (df = 495)      
F Statistic         11,895.240*** (df = 2; 495)
===============================================
Note:               *p<0.1; **p<0.05; ***p<0.01

9 Logistic Regression

To be continued…

10 Poisson Regression

A scientist studies the effect of seizure drugs on the number of seizures during the first eight weeks of treatment. The dataset is available in the robust package:

Carry out a Poisson regression of the number of seizures after treatment (sumY) as the dependent variable, and the treatment conditions (Trt), Age, and the number of seizures before treatment (Base) as the independent variables.

data("breslow.dat", package = 'robust')
View(breslow.dat)

fit.breslow <- glm(sumY ~ Trt + Age + Base, data = breslow.dat, family = poisson())
summary(fit.breslow)

Call:
glm(formula = sumY ~ Trt + Age + Base, family = poisson(), data = breslow.dat)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-6.0569  -2.0433  -0.9397   0.7929  11.0061  

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)   1.9488259  0.1356191  14.370  < 2e-16 ***
Trtprogabide -0.1527009  0.0478051  -3.194   0.0014 ** 
Age           0.0227401  0.0040240   5.651 1.59e-08 ***
Base          0.0226517  0.0005093  44.476  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 2122.73  on 58  degrees of freedom
Residual deviance:  559.44  on 55  degrees of freedom
AIC: 850.71

Number of Fisher Scoring iterations: 5
exp(coef(fit.breslow))
 (Intercept) Trtprogabide          Age         Base 
   7.0204403    0.8583864    1.0230007    1.0229102 
stargazer::stargazer(fit.breslow, type = 'text')

=============================================
                      Dependent variable:    
                  ---------------------------
                             sumY            
---------------------------------------------
Trtprogabide               -0.153***         
                            (0.048)          
                                             
Age                        0.023***          
                            (0.004)          
                                             
Base                       0.023***          
                            (0.001)          
                                             
Constant                   1.949***          
                            (0.136)          
                                             
---------------------------------------------
Observations                  59             
Log Likelihood             -421.354          
Akaike Inf. Crit.           850.707          
=============================================
Note:             *p<0.1; **p<0.05; ***p<0.01
equatiomatic::extract_eq(fit.breslow, use_coefs = TRUE)

\[ \log ({ \widehat{E( \operatorname{sumY} )} }) = 1.95 - 0.15(\operatorname{Trt}_{\operatorname{progabide}}) + 0.02(\operatorname{Age}) + 0.02(\operatorname{Base}) \]
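Before refitting with a quasi-Poisson family, overdispersion can be checked by comparing the residual deviance to its degrees of freedom (a ratio far above 1 indicates overdispersion):

deviance(fit.breslow) / df.residual(fit.breslow)  # 559.44 / 55, about 10.2, well above 1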

fit.breslow.od <- glm(sumY ~ Trt + Age + Base, data = breslow.dat, family = quasipoisson())
summary(fit.breslow.od)

Call:
glm(formula = sumY ~ Trt + Age + Base, family = quasipoisson(), 
    data = breslow.dat)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-6.0569  -2.0433  -0.9397   0.7929  11.0061  

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.948826   0.465091   4.190 0.000102 ***
Trtprogabide -0.152701   0.163943  -0.931 0.355702    
Age           0.022740   0.013800   1.648 0.105085    
Base          0.022652   0.001747  12.969  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for quasipoisson family taken to be 11.76075)

    Null deviance: 2122.73  on 58  degrees of freedom
Residual deviance:  559.44  on 55  degrees of freedom
AIC: NA

Number of Fisher Scoring iterations: 5
exp(coef(fit.breslow.od))
 (Intercept) Trtprogabide          Age         Base 
   7.0204403    0.8583864    1.0230007    1.0229102 
stargazer::stargazer(fit.breslow.od, type = 'text')

========================================
                 Dependent variable:    
             ---------------------------
                        sumY            
----------------------------------------
Trtprogabide           -0.153           
                       (0.164)          
                                        
Age                     0.023           
                       (0.014)          
                                        
Base                  0.023***          
                       (0.002)          
                                        
Constant              1.949***          
                       (0.465)          
                                        
----------------------------------------
Observations             59             
========================================
Note:        *p<0.1; **p<0.05; ***p<0.01
equatiomatic::extract_eq(fit.breslow.od, use_coefs = TRUE)

\[ \log ({ \widehat{E( \operatorname{sumY} )} }) = 1.95 - 0.15(\operatorname{Trt}_{\operatorname{progabide}}) + 0.02(\operatorname{Age}) + 0.02(\operatorname{Base}) \]

SessionInfo:

R version 4.2.3 (2023-03-15 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)

Matrix products: default

locale:
[1] LC_COLLATE=Chinese (Simplified)_China.utf8 
[2] LC_CTYPE=Chinese (Simplified)_China.utf8   
[3] LC_MONETARY=Chinese (Simplified)_China.utf8
[4] LC_NUMERIC=C                               
[5] LC_TIME=English_United States.1252         

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] agricolae_1.3-5    nycflights13_1.0.2 lubridate_1.9.2    forcats_1.0.0     
 [5] stringr_1.5.0      purrr_1.0.1        readr_2.1.4        tidyr_1.3.0       
 [9] tibble_3.2.1       ggplot2_3.4.2      tidyverse_2.0.0    openair_2.16-0    
[13] dplyr_1.1.0        latex2exp_0.9.6   

loaded via a namespace (and not attached):
 [1] nlme_3.1-162        RColorBrewer_1.1-3  backports_1.4.1    
 [4] tools_4.2.3         utf8_1.2.3          R6_2.5.1           
 [7] AlgDesign_1.2.1     mgcv_1.8-42         questionr_0.7.8    
[10] colorspace_2.1-0    withr_2.5.0         tidyselect_1.2.0   
[13] GGally_2.1.2        klaR_1.7-2          compiler_4.2.3     
[16] cli_3.6.1           labeling_0.4.2      equatiomatic_0.3.1 
[19] scales_1.2.1        hexbin_1.28.3       digest_0.6.31      
[22] minqa_1.2.5         rmarkdown_2.21      jpeg_0.1-10        
[25] pkgconfig_2.0.3     htmltools_0.5.4     lme4_1.1-31        
[28] labelled_2.11.0     fastmap_1.1.0       highr_0.10         
[31] maps_3.4.1          htmlwidgets_1.6.2   rlang_1.1.0        
[34] rstudioapi_0.14     shiny_1.7.4         generics_0.1.3     
[37] farver_2.1.1        combinat_0.0-8      jsonlite_1.8.4     
[40] magrittr_2.0.3      interp_1.1-4        Matrix_1.5-3       
[43] Rcpp_1.0.10         munsell_0.5.0       fansi_1.0.3        
[46] lifecycle_1.0.3     stringi_1.7.12      yaml_2.3.7         
[49] MASS_7.3-58.2       plyr_1.8.8          grid_4.2.3         
[52] promises_1.2.0.1    miniUI_0.1.1.1      deldir_1.0-6       
[55] lattice_0.20-45     stargazer_5.2.3     haven_2.5.2        
[58] splines_4.2.3       mapproj_1.2.11      hms_1.1.3          
[61] knitr_1.42          pillar_1.9.0        boot_1.3-28.1      
[64] glue_1.6.2          evaluate_0.20       latticeExtra_0.6-30
[67] nloptr_2.0.3        png_0.1-8           vctrs_0.6.2        
[70] tzdb_0.3.0          httpuv_1.6.9        gtable_0.3.3       
[73] reshape_0.8.9       xfun_0.37           mime_0.12          
[76] broom_1.0.4         xtable_1.8-4        later_1.3.0        
[79] faraway_1.0.8       cluster_2.1.4       corrplot_0.92      
[82] timechange_0.2.0    ellipsis_0.3.2